FIFA 22 PLAYERS ANALYSIS

Data and exploration

Data

fifa <- read.csv("players_22.csv")
#head(fifa, 3)

Missing values

missing.values <- fifa %>%
    gather(key = "key", value = "val") %>%
    mutate(is.missing = is.na(val)) %>%
    group_by(key, is.missing) %>%
    summarise(num.missing = n()) %>%
    filter(is.missing == T) %>%
    select(-is.missing) %>%
    arrange(desc(num.missing))

missing.values %>%
    ggplot() +
    geom_bar(aes(x = key, y = num.missing), stat = "identity") +
    labs(x = "variable", y = "number of missing values", title = "Number of missing values") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

Duplicates

sprintf("Number of duplicates columns:")
## [1] "Number of duplicates columns:"
sum(duplicated(fifa$long_name) == TRUE)
## [1] 20
sprintf("Dimension  before")
## [1] "Dimension  before"
dim(fifa)
## [1] 19239   110
fifa <- fifa %>% distinct(long_name, .keep_all = TRUE)
sprintf("Dimension  after")
## [1] "Dimension  after"
dim(fifa)
## [1] 19219   110

Data Cleaning

fifa <- fifa %>%
    mutate(wage_eur = ifelse(is.na(wage_eur),
        median(wage_eur, na.rm = T),
        wage_eur
    )) %>%
    mutate(value_eur = ifelse(is.na(value_eur),
        median(value_eur, na.rm = T),
        value_eur
    )) %>%
    mutate(passing = ifelse(is.na(passing),
        median(passing, na.rm = T),
        passing
    )) %>%
    mutate(dribbling = ifelse(is.na(dribbling),
        median(dribbling, na.rm = T),
        dribbling
    )) %>%
    mutate(pace = ifelse(is.na(pace),
        median(pace, na.rm = T),
        pace
    )) %>%
    mutate(defending = ifelse(is.na(defending),
        median(defending, na.rm = T),
        defending
    )) %>%
    mutate(shooting = ifelse(is.na(shooting),
        median(shooting, na.rm = T),
        shooting
    )) %>%
    mutate(physic = ifelse(is.na(physic),
        median(physic, na.rm = T),
        physic
    ))

Visaulizing Amounts

league1 <- table(fifa$league_name)
league2=as.data.frame(league1)
league3 <- league2 %>% 
  mutate(Freq = as.numeric(Freq)) %>% 
  arrange(desc(Freq)) %>%
  slice(1:5)
par(las=2)
par(mar=c(5,10,4,2))

barplot(league3$Freq,names.arg=c("USA Major League Soccer", "Argentina Primera División",
                                 "English League Championship", "English Premier League", "Spain Primera Division"), cex.names=0.8,xlab="Number of Players",col="blue",
main="Top 5 Leagues with Highest Number of Players",border="red", horiz=TRUE)

Where are the players from?

fifa_country_count <- fifa %>%
    group_by(nationality_name) %>%
    summarize(Freq = n())

fifa_country_count$nationality_name[fifa_country_count$nationality_name 
                                    == "United States"] <- "United States of America"
fifa_country_count
## # A tibble: 163 × 2
##    nationality_name     Freq
##    <chr>               <int>
##  1 Afghanistan             1
##  2 Albania                46
##  3 Algeria                51
##  4 Andorra                 1
##  5 Angola                 17
##  6 Antigua and Barbuda     3
##  7 Argentina             960
##  8 Armenia                 7
##  9 Australia             266
## 10 Austria               318
## # … with 153 more rows
library(plotly)
country_with_code <- read.csv("https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/slim-3/slim-3.csv")
head(country_with_code)
##             name alpha.3 country.code
## 1    Afghanistan     AFG            4
## 2  Ã…land Islands     ALA          248
## 3        Albania     ALB            8
## 4        Algeria     DZA           12
## 5 American Samoa     ASM           16
## 6        Andorra     AND           20
fifa_country_count_with_code <- fifa_country_count %>%
    left_join(country_with_code,
        by = c("nationality_name" = "name")
    )

fig <- plot_ly(fifa_country_count_with_code,
    type = "choropleth",
    locations = fifa_country_count_with_code$alpha.3,
    z = fifa_country_count_with_code$Freq,
    text = fifa_country_count_with_code$nationality_name,
    colorscale = "ice"
)

fig <- fig %>% colorbar(title = "No of players")
fig <- fig %>% layout(
    title = "Choropleth showing FIFA 2022 Players' Nationality <br>(Hover for breakdown)"
)

fig

Visaulizing Proportions

nation1 <- table(fifa$nationality_name)
nation2=as.data.frame(nation1)
nation3 <- nation2 %>% 
  mutate(Freq = as.numeric(Freq)) %>% 
  arrange(desc(Freq)) %>%
  slice(1:5)
pct <- round(nation3$Freq/sum(nation3$Freq)*100)
nation3$Var1 <- paste(pct,"%",sep="") 
colors <- c("grey", "blue","green", "yellow", "red")
pie(nation3$Freq,labels = nation3$Var1, col = colors,
    main="Top 5 Countries with Highest Number of Players", cex=0.8)
legend(1.2, .5, c("England", "Germany", "Spain", "France", "Argentina"), fill = colors) 

Age Distribution

ggplot(fifa, aes(x = age)) +
    geom_boxplot() +
    labs(
        title = "The Distribution of Age",
        x = "Age"
    )

Oldest Players

old <- fifa %>%
    arrange(desc(age)) %>%
    select(short_name, nationality_name, age)
head(old)
##     short_name nationality_name age
## 1     K. Miura            Japan  54
## 2    G. Buffon            Italy  43
## 3 C. Lucchetti        Argentina  43
## 4  S. Nakamura            Japan  43
## 5      D. Vaca          Bolivia  42
## 6   K. Ellison          England  42

Youngest Players

young <- fifa %>%
    arrange(age) %>%
    select(short_name, nationality_name, age)
head(young)
##          short_name nationality_name age
## 1              Gavi            Spain  16
## 2          V. Barco        Argentina  16
## 3 A. Kalogeropoulos           Greece  16
## 4              Yayo            Spain  16
## 5   R. van den Berg      Netherlands  16
## 6           A. Musi          Romania  16

Some line charts with age

Players Attributes vs. Age

filtered_attributes <- fifa %>%
    group_by(age) %>%
    summarise_at(
        vars(overall, potential, pace),
        list(mean)
    )
filtered_attributes
## # A tibble: 29 × 4
##      age overall potential  pace
##    <int>   <dbl>     <dbl> <dbl>
##  1    16    55.8      75.9  68.5
##  2    17    56.6      74.8  67.9
##  3    18    58.0      74.1  68.5
##  4    19    59.2      73.3  68.3
##  5    20    60.7      72.5  68.9
##  6    21    63.2      72.7  69.4
##  7    22    64.0      72.4  70.0
##  8    23    65.7      72.5  70.5
##  9    24    66.6      71.9  70.5
## 10    25    67.6      71.1  70.3
## # … with 19 more rows
library("reshape2")
data_long <- melt(filtered_attributes, id = "age")

ggplot(
    data_long,
    aes(
        x = age,
        y = value,
        color = variable
    )
) +
    geom_line() +
    labs(
        y = "Player Atrributes Rating",
        x = "Age",
        color = "Color Legend",
        title = "Attributes vs Age",
    )

Players Value vs. Age

money_attributes <- fifa %>%
    group_by(age) %>%
    summarise_at(
        vars(value_eur),
        list(mean)
    )

ggplot(money_attributes, aes(x = age, y = value_eur)) +
    geom_line() +
    labs(
        y = "Player Value in Millions",
        x = "Age",
        title = "Player value vs Age",
    ) +
    scale_y_continuous(labels = label_number(suffix = " M", scale = 1e-6)) # thousands

Modeling

hist(fifa$overall, xlab="overall Rating", main = "Histogram of Overall Rating")

fifa$overall = log10(fifa$overall)


hist(fifa$overall, xlab = "Log10 of Overall Rating", main = "Histogram of Overall Rating- Log10")

hist(fifa$wage_eur, xlab = "Wage in Euros", main = "Histogram of Wage")

fifa$wage_eur = log10(fifa$wage_eur)

#fifa$wage_eur = log(fifa$overall)
hist(fifa$overall, xlab="Wage in Euros", main="Histogram of Wage- Log10")

Train / test

  • Create an initial split
dt <- sort(sample(nrow(fifa), nrow(fifa) * .7))
train <- fifa[dt, ]
test <- fifa[-dt, ]

train <- train %>% filter(!is.na(overall))
test <- test %>% filter(!is.na(overall))
overall_individualskills <- lm(overall ~ pace + shooting + passing + dribbling + defending +
                                 physic + age + preferred_foot + attacking_crossing + attacking_finishing + attacking_heading_accuracy + 
                                 attacking_short_passing + attacking_volleys + skill_dribbling +skill_curve +skill_fk_accuracy +
                                 skill_long_passing +skill_ball_control +movement_acceleration +movement_sprint_speed +
                                 movement_agility +movement_reactions +movement_balance +power_shot_power +power_jumping+ power_stamina +
                                 power_strength +power_long_shots +mentality_aggression +mentality_interceptions +mentality_positioning +
                                 mentality_vision +mentality_penalties +mentality_composure +defending_marking_awareness +defending_standing_tackle +
                                 defending_sliding_tackle, data = train)

summary(overall_individualskills)
## 
## Call:
## lm(formula = overall ~ pace + shooting + passing + dribbling + 
##     defending + physic + age + preferred_foot + attacking_crossing + 
##     attacking_finishing + attacking_heading_accuracy + attacking_short_passing + 
##     attacking_volleys + skill_dribbling + skill_curve + skill_fk_accuracy + 
##     skill_long_passing + skill_ball_control + movement_acceleration + 
##     movement_sprint_speed + movement_agility + movement_reactions + 
##     movement_balance + power_shot_power + power_jumping + power_stamina + 
##     power_strength + power_long_shots + mentality_aggression + 
##     mentality_interceptions + mentality_positioning + mentality_vision + 
##     mentality_penalties + mentality_composure + defending_marking_awareness + 
##     defending_standing_tackle + defending_sliding_tackle, data = train)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.097935 -0.009787  0.000010  0.010223  0.094991 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  1.437e+00  2.140e-03 671.663  < 2e-16 ***
## pace                         8.007e-05  5.697e-05   1.405 0.159924    
## shooting                    -2.228e-03  1.882e-04 -11.840  < 2e-16 ***
## passing                      1.869e-04  1.093e-04   1.711 0.087173 .  
## dribbling                    1.983e-03  1.562e-04  12.692  < 2e-16 ***
## defending                    2.351e-03  1.562e-04  15.053  < 2e-16 ***
## physic                      -2.428e-04  7.689e-05  -3.159 0.001589 ** 
## age                          7.849e-04  3.976e-05  19.742  < 2e-16 ***
## preferred_footRight         -2.582e-03  3.533e-04  -7.307 2.88e-13 ***
## attacking_crossing           2.026e-04  2.981e-05   6.795 1.13e-11 ***
## attacking_finishing          1.120e-03  8.725e-05  12.840  < 2e-16 ***
## attacking_heading_accuracy   1.487e-04  2.475e-05   6.009 1.91e-09 ***
## attacking_short_passing      5.811e-04  4.706e-05  12.348  < 2e-16 ***
## attacking_volleys            1.369e-05  2.424e-05   0.565 0.572090    
## skill_dribbling             -6.984e-04  7.608e-05  -9.179  < 2e-16 ***
## skill_curve                 -9.598e-05  2.257e-05  -4.252 2.13e-05 ***
## skill_fk_accuracy           -2.607e-05  2.008e-05  -1.299 0.194121    
## skill_long_passing          -2.347e-04  3.135e-05  -7.486 7.52e-14 ***
## skill_ball_control           4.855e-04  5.558e-05   8.737  < 2e-16 ***
## movement_acceleration        2.942e-04  3.644e-05   8.073 7.45e-16 ***
## movement_sprint_speed        1.676e-04  3.890e-05   4.309 1.65e-05 ***
## movement_agility            -1.655e-04  2.574e-05  -6.430 1.32e-10 ***
## movement_reactions           2.153e-03  2.971e-05  72.450  < 2e-16 ***
## movement_balance            -2.470e-04  2.066e-05 -11.953  < 2e-16 ***
## power_shot_power             8.340e-04  4.436e-05  18.800  < 2e-16 ***
## power_jumping                5.397e-05  1.613e-05   3.345 0.000824 ***
## power_stamina                2.120e-04  2.617e-05   8.100 5.97e-16 ***
## power_strength               3.443e-04  3.682e-05   9.351  < 2e-16 ***
## power_long_shots             1.579e-04  4.470e-05   3.533 0.000412 ***
## mentality_aggression        -3.608e-05  2.295e-05  -1.572 0.115967    
## mentality_interceptions     -6.039e-04  4.110e-05 -14.696  < 2e-16 ***
## mentality_positioning       -3.453e-04  2.706e-05 -12.761  < 2e-16 ***
## mentality_vision            -7.748e-05  2.741e-05  -2.826 0.004718 ** 
## mentality_penalties          1.295e-04  2.286e-05   5.663 1.52e-08 ***
## mentality_composure          5.667e-04  2.495e-05  22.719  < 2e-16 ***
## defending_marking_awareness -4.961e-04  5.006e-05  -9.911  < 2e-16 ***
## defending_standing_tackle   -5.847e-04  5.939e-05  -9.845  < 2e-16 ***
## defending_sliding_tackle    -2.556e-04  3.922e-05  -6.516 7.47e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01663 on 13415 degrees of freedom
## Multiple R-squared:  0.8685, Adjusted R-squared:  0.8682 
## F-statistic:  2395 on 37 and 13415 DF,  p-value: < 2.2e-16
wage_individualskills <- lm(wage_eur ~ pace + shooting + passing + dribbling + defending +
                              physic + age + preferred_foot + attacking_crossing + attacking_finishing + attacking_heading_accuracy +
                              attacking_short_passing + attacking_volleys + skill_dribbling +skill_curve +skill_fk_accuracy +
                              skill_long_passing +skill_ball_control +movement_acceleration +movement_sprint_speed +movement_agility +
                              movement_reactions +movement_balance +power_shot_power +power_jumping+ power_stamina +
                              power_strength +power_long_shots +mentality_aggression +mentality_interceptions +mentality_positioning +
                              mentality_vision +mentality_penalties +mentality_composure +defending_marking_awareness +defending_standing_tackle +
                              defending_sliding_tackle, data = train)

summary(wage_individualskills)
## 
## Call:
## lm(formula = wage_eur ~ pace + shooting + passing + dribbling + 
##     defending + physic + age + preferred_foot + attacking_crossing + 
##     attacking_finishing + attacking_heading_accuracy + attacking_short_passing + 
##     attacking_volleys + skill_dribbling + skill_curve + skill_fk_accuracy + 
##     skill_long_passing + skill_ball_control + movement_acceleration + 
##     movement_sprint_speed + movement_agility + movement_reactions + 
##     movement_balance + power_shot_power + power_jumping + power_stamina + 
##     power_strength + power_long_shots + mentality_aggression + 
##     mentality_interceptions + mentality_positioning + mentality_vision + 
##     mentality_penalties + mentality_composure + defending_marking_awareness + 
##     defending_standing_tackle + defending_sliding_tackle, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.6740 -0.2350  0.0160  0.2624  1.3419 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -0.1012120  0.0495589  -2.042 0.041145 *  
## pace                        -0.0010307  0.0013196  -0.781 0.434752    
## shooting                    -0.0173958  0.0043590  -3.991 6.62e-05 ***
## passing                     -0.0018153  0.0025307  -0.717 0.473201    
## dribbling                    0.0176433  0.0036188   4.875 1.10e-06 ***
## defending                    0.0266347  0.0036178   7.362 1.92e-13 ***
## physic                      -0.0027722  0.0017809  -1.557 0.119593    
## age                         -0.0093548  0.0009210 -10.158  < 2e-16 ***
## preferred_footRight         -0.0123259  0.0081832  -1.506 0.132030    
## attacking_crossing           0.0026669  0.0006906   3.862 0.000113 ***
## attacking_finishing          0.0077875  0.0020210   3.853 0.000117 ***
## attacking_heading_accuracy   0.0040710  0.0005732   7.102 1.29e-12 ***
## attacking_short_passing      0.0022091  0.0010900   2.027 0.042716 *  
## attacking_volleys            0.0023400  0.0005614   4.168 3.09e-05 ***
## skill_dribbling             -0.0028624  0.0017622  -1.624 0.104324    
## skill_curve                  0.0020949  0.0005228   4.007 6.19e-05 ***
## skill_fk_accuracy           -0.0013095  0.0004650  -2.816 0.004868 ** 
## skill_long_passing          -0.0008253  0.0007262  -1.136 0.255807    
## skill_ball_control           0.0015643  0.0012873   1.215 0.224301    
## movement_acceleration        0.0016593  0.0008441   1.966 0.049339 *  
## movement_sprint_speed        0.0035286  0.0009010   3.916 9.04e-05 ***
## movement_agility            -0.0011727  0.0005963  -1.967 0.049233 *  
## movement_reactions           0.0233212  0.0006883  33.883  < 2e-16 ***
## movement_balance            -0.0016850  0.0004786  -3.521 0.000431 ***
## power_shot_power             0.0091222  0.0010275   8.878  < 2e-16 ***
## power_jumping                0.0010828  0.0003737   2.898 0.003765 ** 
## power_stamina               -0.0011059  0.0006062  -1.824 0.068130 .  
## power_strength               0.0025534  0.0008529   2.994 0.002760 ** 
## power_long_shots            -0.0000397  0.0010354  -0.038 0.969411    
## mentality_aggression         0.0018055  0.0005316   3.396 0.000685 ***
## mentality_interceptions     -0.0073270  0.0009519  -7.697 1.49e-14 ***
## mentality_positioning       -0.0035675  0.0006268  -5.692 1.28e-08 ***
## mentality_vision             0.0004527  0.0006350   0.713 0.475894    
## mentality_penalties          0.0012220  0.0005295   2.308 0.021023 *  
## mentality_composure          0.0086118  0.0005778  14.905  < 2e-16 ***
## defending_marking_awareness -0.0055073  0.0011595  -4.750 2.06e-06 ***
## defending_standing_tackle   -0.0065354  0.0013756  -4.751 2.05e-06 ***
## defending_sliding_tackle    -0.0022125  0.0009084  -2.436 0.014879 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3852 on 13415 degrees of freedom
## Multiple R-squared:  0.5699, Adjusted R-squared:  0.5688 
## F-statistic: 480.5 on 37 and 13415 DF,  p-value: < 2.2e-16
overalldata <- data.frame(actual= test$overall, predicted = predict(overall_individualskills, test))
wagedata <- data.frame(actual= test$wage_eur, predicted = predict(wage_individualskills, test))
#colSums(is.na(data2))
m2wage <- mean((wagedata$actual- wagedata$predicted)^2)
m2wage
## [1] 0.1543064
m2overall <-  mean((overalldata$actual - overalldata$predicted)^2)
m2overall
## [1] 0.0002738323
ggplot(wagedata, aes(x = predicted, y = actual)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1) +
  labs(x='Transformed Predicted Wage Values', y='Transformed Actual Wage Values',
       title='Transformed Predicted vs. Transformed Actual Wage Values')

ggplot(overalldata, aes(x = predicted, y = actual)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1) +
  labs(x='Transformed Predicted Overall Values', y='Transformed Actual Overall Values',
       title='Transformed Predicted vs. Transformed Actual Overall Values')

Ghana players cluster

# filter out players from team Ghana who arenot reserves or substitutes
ghana <- fifa %>%
    filter(fifa$nationality_name == "Ghana" & fifa$club_position != "RES" &
             fifa$club_position != "SUB") %>%
    select(short_name, shooting, pace, passing, dribbling, defending)

# drop duplicates if any
ghana <- ghana %>% distinct(short_name, .keep_all = TRUE)

# make player name as rownames
rownames(ghana) <- ghana$short_name
ghana2 <- ghana %>% select(2:5)

# perform k means clustering with 10 centers
k2 <- kmeans(ghana2, centers = 10, nstart = 25)
# str(k2)

fviz_cluster(k2, data = ghana2)

Current Portugal Team Cluster

portugal_current <- c(
    "André Silva", "Antonio Silva", "Bernardo Silva", "Bruno Fernandes",
    "Cristiano Ronaldo", "Danilo Pereira", "Diogo Costa", "Diogo Dalot",
    "Gonçalo Ramos", "João Cancelo", "João Félix", "João Mário", "Palhinha",
    "José Sá", "Matheus Nunes", "Nuno Mendes", "Otavio", "Pepe", "Rafael Leão",
    "Raphael Guzzo", "Ricardo Horta", "Rúben Dias", "Ruben Neves", "Rui Patrício",
    "Vitinha", "William Carvalho"
)

portugal <- fifa %>%
    filter(fifa$nationality_name == "Portugal") %>%
    select(short_name, shooting, pace, passing, dribbling, defending)

# filter out the players in current national team
portugal <- portugal[portugal$short_name %in% portugal_current, ]

# drop duplicates if any
portugal <- portugal %>% distinct(short_name, .keep_all = TRUE)

# make player name as rownames
rownames(portugal) <- portugal$short_name
portugal2 <- portugal %>% select(2:5)

# perform k means clustering with 5 centers
k2 <- kmeans(portugal2, centers = 5, nstart = 25)
# str(k2)
fviz_cluster(k2, data = portugal2)

res <- hcut(portugal2, k = 4, stand = TRUE)
# Visualize
fviz_dend(res, rect = TRUE, cex = 0.5,
          k_colors = c("#00AFBB","#2E9FDF", "#E7B800", "#FC4E07"))